<?php
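# Counts how many times each word occurs in each question, filtering out unwanted words along the way.
# Words that appear in the title are weighted 3x (by default) so that the question title is not drowned out by a large body of content.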
// ---------------------------------------------------------------------------
// Command-line handling.
//
// Three positional arguments are required (title file, content file and
// destination file), so together with the script name $argc must be >= 4.
// ---------------------------------------------------------------------------
if ($argc < 4) {
    die("Usage: php statWords.php titleFile ContentFile dstFile [ratio]\n");
}

$titleFile = $argv[1];
$contentFile = $argv[2];
$dstFile = $argv[3];

// Weight added for every occurrence of a word in the title. Defaults to 3;
// an empty or "0" fourth argument also falls back to the default.
$ratio = 3;
if (!empty($argv[4])) {
    if (!is_numeric($argv[4])) {
        die("ratio must be a number\n");
    }
    // "+ 0" turns the numeric string into an int or float so that the
    // accumulated counts are real numbers from the first occurrence on.
    $ratio = $argv[4] + 0;
}

// ---------------------------------------------------------------------------
// Open the files. The destination is opened (and therefore truncated) only
// after both inputs were opened successfully, so a bad input path does not
// wipe an existing output file.
// ---------------------------------------------------------------------------
$titleHandle = @fopen($titleFile, "r");
$contentHandle = @fopen($contentFile, "r");
$dstHandle = ($titleHandle && $contentHandle) ? @fopen($dstFile, "w") : false;
if (!$titleHandle || !$contentHandle || !$dstHandle) {
    die("open file error!\n");
}

// ---------------------------------------------------------------------------
// Main loop: the two input files are read in lock-step, one line each.
// Every line is a space-separated list whose first token is the record id;
// the remaining tokens are the words to count.
// ---------------------------------------------------------------------------
while (($titleLine = fgets($titleHandle)) !== false) {
    // Whole lines are read (no length cap) so that an over-long line is never
    // split into two reads, which would desynchronise the two files.
    $buf1 = trim($titleLine);
    $contentLine = fgets($contentHandle);
    $buf2 = ($contentLine === false) ? '' : trim($contentLine);

    // Only the first 3000 characters of the content are taken into account.
    if (mb_strlen($buf2, "UTF-8") > 3000) {
        $buf2 = mb_substr($buf2, 0, 3000, "UTF-8");
    }

    // Blank line in both files: nothing to emit. Writing a lone "\t" here
    // would glue itself onto the front of the next record.
    if ($buf1 === '' && $buf2 === '') {
        continue;
    }

    $titleArr = explode(" ", $buf1);
    $contentArr = explode(" ", $buf2);

    // The ids must be identical strings. A loose comparison would treat
    // numeric-looking ids such as "007" and "7" as equal.
    if ($titleArr[0] !== $contentArr[0]) {
        echo $buf1."\n";
        echo $buf2."\n";
        exit(1);
    }

    // term => weighted count; title terms are inserted first, then content.
    $cnt = array();
    $sources = array(
        array($titleArr, $ratio),
        array($contentArr, 1),
    );
    foreach ($sources as $source) {
        list($tokens, $weight) = $source;
        foreach ($tokens as $key => $item) {
            if (0 === $key) continue;           // index 0 is the record id
            if (strlen($item) < 2) continue;    // skip tokens shorter than 2 bytes
            if (!filterStr($item)) continue;    // skip blacklisted tokens
            if (isset($cnt[$item])) {
                $cnt[$item] += $weight;
            } else {
                $cnt[$item] = $weight;
            }
        }
    }

    // Output format: "<id>\t<term>@<count> <term>@<count> ...\n"
    $line = $titleArr[0]."\t";
    foreach ($cnt as $term => $num) {
        $line .= "$term@$num ";
    }
    fwrite($dstHandle, $line."\n");
}

fclose($titleHandle);
fclose($contentHandle);
fclose($dstHandle);

/**
 * Decide whether a token may be counted as a term.
 *
 * The token is rejected as soon as it contains any fragment from the
 * blacklist below. The fragments appear to target URL, e-mail and markup
 * residue. Matching is a plain, case-sensitive substring test.
 *
 * @param string $str Token to inspect.
 * @return bool false when the token contains a blacklisted fragment,
 *              true otherwise.
 */
function filterStr($str) {
    // Single-byte fragments, checked with the byte-wise strpos().
    $blockedFragments = array(
        '.php', 'px', 'babytree.com', '.jpg', '@', 'image', 'font',
        'mailto', 'padding', 'tbody', 'align', '.com', 'ask',
        'color', 'strong', 'gb2312',
        'face', '.html', '-style', '.gif', '_pic_', '-spacing',
    );

    foreach ($blockedFragments as $fragment) {
        if (false !== strpos($str, $fragment)) {
            return false;
        }
    }

    // The one multibyte fragment (a font name) goes through mb_strpos().
    return false === mb_strpos($str, '楷体');
}
